Testing primers against PR2
Testing primers against PR2
1 Goal
- Testing primers against the PR2 database (latest version 4.11.1)
2 Notes about script
- Chunks
compute_matchesandstore_matcheshave only to be run in 2 cases- new version of pr2
- new set of primers
- In the other case, they can be set eval=FALSE
2.1 To do
- Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)
3 Initialize
Load the variables common to the different scripts and the necessary libraries
# Libraries for bioinfo ----------------------------------------------------
library("Biostrings")
# Libraries tidyr ---------------------------------------------------------
library("ggplot2")
library("dplyr")
library("readxl")
library("tibble")
library("readr")
library("purrr")
library("forcats")
library("lubridate")
library("stringr")
# Libraries dvutils and pr2database -------------------------------------------------------
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
if(any(grepl("package:pr2database", search()))) detach("package:pr2database", unload=TRUE)
library("pr2database")
# Options for knitting the report -------------
library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=TRUE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="500")
options(knitr.kable.NA = '')4 Read primer file
4.1 Verify new primer labelling - DO NOT RERUN
primer_sets <- read_excel(path = "primers_18S.xlsx", sheet = "primer sets")
primers <- read_excel(path = "primers_18S.xlsx", sheet = "primers")
primers_fwd <- left_join(primer_sets, primers, by = c(fwd_id = "primer_id")) %>% filter(fwd_seq !=
sequence)
primer_rev <- left_join(primer_sets, primers, by = c(rev_id = "primer_id")) %>% filter(rev_seq !=
sequence)4.2 Compute position on yeast - DO NOT RERUN
primers <- read_excel(path = "primers_18S.xlsx", sheet = "primers")
ref_seq <- pr2 %>% filter(pr2_accession == "FU970071.1.1799_U")
# Matches the position of the primers on the yeast sequence - use map2_dfr to get an data
# frame on output and not a list - use ~ when defining the function
primers_pos <- map2_dfr(primers$sequence, primers$direction, ~get_primer_position(.x, ref_seq$sequence,
orientation = .y, mismatches = 3))
primers <- bind_cols(primers, primers_pos)
write_tsv(primers, path = "primers_matches.tsv", na = "")4.3 Build the primer set table 1
primer_sets <- read_excel(path = "primers_18S.xlsx", sheet = "primer_sets")
primers <- read_excel(path = "primers_18S.xlsx", sheet = "primers")
regions = c("V4", "V4-specific", "V9", "Universal")
# Just keep the selected primers (V4, V9 etc..)
primer_sets <- primer_sets %>% filter(region %in% regions)
primer_sets <- primer_sets %>% left_join(select(primers, primer_id, fwd_name = name, fwd_seq = sequence,
fwd_start_yeast = start_yeast, fwd_end_yeast = end_yeast), by = c(fwd_id = "primer_id")) %>%
left_join(select(primers, primer_id, rev_name = name, rev_seq = sequence, rev_start_yeast = start_yeast,
rev_end_yeast = end_yeast), by = c(rev_id = "primer_id")) %>% mutate(length_yeast = rev_end_yeast -
fwd_start_yeast + 1) %>% select(region:primer_set_name, contains("fwd"), contains("rev"),
length_yeast, used_in:remark)
write_tsv(primer_sets, path = "primers_Table_1.tsv", na = "")
knitr::kable(select(primer_sets, region:primer_set_name, fwd_name, rev_name, length_yeast)) %>%
kableExtra::kable_styling()| region | specificity | primer_set_id | primer_set_name | fwd_name | rev_name | length_yeast |
|---|---|---|---|---|---|---|
| V4 | 1 | Hadziavdic1 | F-566 | R-1200 | 635 | |
| V4 | 2 | Hadziavdic2 | A-528F | R-952 | 379 | |
| V4 | 3 | Hugerth1 | 574*f | 1132r | 576 | |
| V4 | 4 | Hugerth2 | 563f | 1132r | 587 | |
| V4 | 5 | Hugerth3 | 616f | 1132r | 534 | |
| V4 | 6 | Hugerth4 | 616*f | 1132r | 534 | |
| V4 | 7 | Bass | V4_1f | TAReukREV3 | 417 | |
| V4 | 8 | StoeckV4 | TAReuk454FWD1 | TAReukREV3 | 417 | |
| V4 | 36 | StoeckV4 | TAReuk454FWD1 | V4RB | 417 | |
| V4 | 12 | Geisen | 3NDF | 1132r | 599 | |
| V4 | 13 | Brate1 | 3NDF | V4_euk_R1 | 465 | |
| V4 | 14 | Brate2 | 3NDF | V4_euk_R2 | 458 | |
| V4 | 15 | Moreno | EUKAF | EUKAR | 410 | |
| V4 | 16 | PireddaV4 | TAReuk454FWD1 | V4 18S Next.Rev | 417 | |
| V4 | 17 | Comeau | E572F | E1009R | 438 | |
| V4 | 18 | Parfrey | 515f | 1119r | 598 | |
| V4 | 22 | Kim | Nex_18S_0587_F | Nex_18S_0964_R | 419 | |
| V4 | 23 | Venter | 590F | 1300R | 720 | |
| V4 | 24 | Simon | EK-565F-NGS | UNonMetR | 527 | |
| V4 | 25 | Mangot | NSF563 | NSR951 | 380 | |
| V4 | 26 | Pawlowski | Nex_18S_0587_F | S12.2R | 445 | |
| V4 | 34 | Lambert | 515F Univ short | NSR951 | 391 | |
| V4 | 40 | Zhan | Uni18SF | Uni18SR | 461 | |
| V4-specific | diatoms | 21 | Zimmerman | D512for | D978rev | 437 |
| V4-specific | cercozoa | 36 | Harder | Cerc479F | Cerc750R | 283 |
| V4-specific | ciliates | 19 | Vannini | Claudia Vannini (F) | Claudia Vannini (R) | 439 |
| V4-specific | diplonemids | 37 | Cannon | DimA | DimB | 295 |
| V4-specific | chlorophyta | 38 | Moro | ChloroF | ChloroR | 494 |
| V4-specific | haptophyta | 39 | Egge | A-528F | PRYM01+7 | 396 |
| V4-specific | non-metazoa | 35 | Bower | UNonMetF | UNonMetR | 578 |
| V9 | 27 | StoeckV9 | 1391F | EukB | 169 | |
| V9 | 28 | Amaral1 | 1380F | 1510R | 176 | |
| V9 | 29 | Amaral2 | 1389F | 1510R | 167 | |
| V9 | 31 | PireddaV9 | 1388F | 1510R | 167 | |
| Universal | 32 | Wilkins | 926wF | 1392-R | 513 | |
| Universal | 33 | Needham | 515f Univ | 926R | 589 |
5 Computing the matches
This part is done by an R script PR2 Primers pr2_match.R that is executed in batch mode.
5.1 Programing Notes
- Use Biostrings
Accessor methods : In the code snippets below, x is an MIndex object.
- length(x): The number of patterns that matches are stored for.
- names(x): The names of the patterns that matches are stored for.
- startIndex(x): A list containing the starting positions of the matches for each pattern.
- endIndex(x): A list containing the ending positions of the matches for each pattern.
- elementNROWS(x): An integer vector containing the number of matches for each pattern.
- x[[i]]: Extract the matches for the i-th pattern as an IRanges object.
- unlist(x, recursive=TRUE, use.names=TRUE): Return all the matches in a single IRanges object. recursive and use.names are ignored.
- extractAllMatches(subject, mindex): Return all the matches in a single XStringViews object.
One could also use another function which does not give the position * match_fwd <- vcountPattern(fwd, seq,max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=FALSE, algorithm=“auto”)
6 Load the data file
This avoids recomputing each time
7 Amplicon length
7.1 Average size
ggplot(pr2_match_final, aes(x = primer_label, y = ampli_size, group = primer_set_id)) + geom_boxplot(outlier.alpha = 0.3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("Primer set")7.2 Plot an example of amplicon distribution
for (one_primer_set in c(16)) {
g <- ggplot(filter(pr2_match_final, primer_set_id == one_primer_set), aes(x = ampli_size)) +
geom_density(fill = "blue", alpha = 0.9) + xlab("Amplicon size") + ggtitle(str_c("Primer set - ",
one_primer_set)) + xlim(350, 600)
# print(g)
g <- ggplot(filter(pr2_match_final, primer_set_id == one_primer_set), aes(x = ampli_size,
fill = supergroup)) + geom_density(alpha = 0.9) + theme_bw() + scale_fill_viridis_d() +
xlab("Amplicon size") + ggtitle(str_c("Primer set - ", one_primer_set)) + xlim(350,
600)
print(g)
}8 Summarize the data in tables
8.1 Summarize all eukaryotes
pr2_match_summary_primer_set <- pr2_match_final %>% filter(sequence_length >= 1500) %>% group_by(region,
primer_label, primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() * 100, pct_rev = sum(!is.na(rev_pos))/n() *
100, pct_amplicons = sum(!is.na(ampli_size))/n() * 100, ampli_size_mean = mean(ampli_size,
na.rm = TRUE), ampli_size_sd = sd(ampli_size, na.rm = TRUE), ampli_size_max = max(ampli_size,
na.rm = TRUE), ampli_size_min = min(ampli_size, na.rm = TRUE), n_seq = n())
write_tsv(pr2_match_summary_primer_set, "pr2_match_summary_primer_set.tsv", na = "")
# Long form for Number of sequences
# This dataframe is used to re-order the bars correctly
pct_category_order <- data.frame(pct_category = c("pct_amplicons", "pct_fwd", "pct_rev"), pct_category_order = c(3,
1, 2))
pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::gather("pct_category",
"pct_seq", pct_fwd:pct_amplicons) %>% left_join(pct_category_order)8.2 Summarize per supergroup
pr2_match_summary_primer_set_sg <- pr2_match_final %>% filter(sequence_length >= 1500) %>%
group_by(supergroup, region, primer_label, primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() *
100, pct_rev = sum(!is.na(rev_pos))/n() * 100, pct_amplicons = sum(!is.na(ampli_size))/n() *
100, ampli_size_mean = case_when(pct_amplicons > 0 ~ mean(ampli_size, na.rm = TRUE)), ampli_size_sd = case_when(pct_amplicons >
0 ~ sd(ampli_size, na.rm = TRUE)), ampli_size_max = case_when(pct_amplicons > 0 ~ max(ampli_size,
na.rm = TRUE)), ampli_size_min = case_when(pct_amplicons > 0 ~ min(ampli_size, na.rm = TRUE)),
n_seq = n()) %>% ungroup()
write_tsv(pr2_match_summary_primer_set_sg, "pr2_match_summary_primer_set_per_sg.tsv", na = "")8.3 Summarize per class
pr2_match_summary_primer_set_class <- pr2_match_final %>% filter(sequence_length >= 1500) %>%
group_by(supergroup, division, class, region, primer_label, primer_set_id) %>% summarize(pct_fwd = sum(!is.na(fwd_pos))/n() *
100, pct_rev = sum(!is.na(rev_pos))/n() * 100, pct_amplicons = sum(!is.na(ampli_size))/n() *
100, ampli_size_mean = case_when(pct_amplicons > 0 ~ mean(ampli_size, na.rm = TRUE)), ampli_size_sd = case_when(pct_amplicons >
0 ~ sd(ampli_size, na.rm = TRUE)), ampli_size_max = case_when(pct_amplicons > 0 ~ max(ampli_size,
na.rm = TRUE)), ampli_size_min = case_when(pct_amplicons > 0 ~ min(ampli_size, na.rm = TRUE)),
n_seq = n())
write_tsv(pr2_match_summary_primer_set_class, "pr2_match_summary_primer_set_per_class.tsv",
na = "")9 Graphics
9.1 All Eukaryotes
Comments
- Percent of sequences amplified
- Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
- In general it is the reverse primer that causes problems.
- Some primer sets do not amplify any sequence (11, 19, 20, 21)
- For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
- Amplicon sizes
- Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
- This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
- Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
for (one_region in regions) {
pr2_match_summary_primer_set_region_long <- filter(pr2_match_summary_primer_set_long, region ==
one_region)
pr2_match_summary_primer_set_region <- filter(pr2_match_summary_primer_set, region == one_region)
pr2_match_region <- filter(pr2_match_final, region == one_region)
g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = reorder(primer_label,
primer_set_id), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)),
width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") +
scale_fill_manual(name = "% amplified", values = c(pct_amplicons = "black", pct_fwd = "blue",
pct_rev = "red"), labels = c("Primer fwd", "Primer rev", "Amplicons")) + ggtitle(str_c(one_region,
" - Percentage of sequences recovered")) + theme(axis.text.x = element_text(angle = 45,
hjust = 1))
print(g)
g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = reorder(primer_label,
primer_set_id), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = reorder(primer_label,
primer_set_id), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + theme(axis.text.x = element_text(angle = 45,
hjust = 1))
print(g)
g <- ggplot(pr2_match_region) + geom_boxplot(aes(x = reorder(primer_label, primer_set_id),
y = ampli_size), colour = "black", outlier.alpha = 0.3) + theme_bw() + xlab("Primer set") +
ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + theme(axis.text.x = element_text(angle = 45,
hjust = 1))
print(g)
}9.2 By supergroup
Comments
- Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
- Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)
for (one_region in regions) {
pr2_match_summary_primer_set_sg_region <- filter(pr2_match_summary_primer_set_sg, (region ==
one_region) & (n_seq > 20))
g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = str_c(supergroup,
" - n= ", n_seq), y = pct_amplicons), fill = "grey", position = "dodge") + theme_bw() +
coord_flip() + ylab("% of sequences amplified") + xlab("Supergroup") + ggtitle(str_c(one_region,
" - % amplified per Supergroup")) + facet_wrap(~primer_label)
print(g)
g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) +
geom_point(aes(x = str_c(supergroup, " - n= ", n_seq), y = ampli_size_mean), colour = "black") +
theme_bw() + coord_flip() + geom_errorbar(aes(x = str_c(supergroup, " - n= ", n_seq),
ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
xlab("Supergroup") + ylab("Amplicon size (bp)") + ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label, scales = "free_x")
print(g)
}9.3 By class for autotrophs
for (one_primer_set in primer_sets$primer_set_id) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) &
(division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")) &
(primer_set_id == one_primer_set))
if (nrow(pr2_match_summary_filtered) > 0) {
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered,
aes(x = str_c(division, "-", class, " - n= ", n_seq), y = pct_amplicons), fill = "grey",
position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
xlab("Class") + ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
print(g)
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + theme_bw() +
coord_flip() + geom_errorbar(aes(x = str_c(division, "-", class, " - n= ", n_seq),
ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) +
xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2)
print(g)
}
}10 Specific analyses
10.1 Specific et sets for Opisthokonta
- 35 UnNonMet
- 16 Piredda
for (one_primer_set in c(16, 17, 35)) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) &
(supergroup %in% c("Opisthokonta")) & (primer_set_id == one_primer_set))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = pct_amplicons), fill = "grey", position = "dodge") +
theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") + ggtitle(str_c("Set -",
one_primer_set, " - % amplified per Class"))
print(g)
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + coord_flip() +
geom_errorbar(aes(x = str_c(division, "-", class, " - n= ", n_seq), ymax = ampli_size_mean +
ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Class") + ylab("Amplicon size (bp)") +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2)
print(g)
}